We conducted field surveys in South and Southeast Asia: Kerala, India (Lat, Long); Indonesia (Lat, Long); the Philippines (Lat, Long); the Central Plain, Thailand (Lat, Long); and the Mekong Delta, Vietnam (Lat, Long). These are important rice-growing areas that use the irrigated lowland rice ecosystem under intensive conditions, growing two crops per year.
library(RCurl) # provides getURL() for fetching the survey data over HTTPS
# Download the survey data, published as CSV from a Google Sheets document.
# NOTE(review): renamed the intermediate variable from 'file' to 'csv.text'
# to avoid shadowing base::file().
csv.text <- getURL("https://docs.google.com/spreadsheets/d/1zB7gNdI7Nk7SuHuPWcjzaKnjuwkvL6sOVMo0zMfuV-c/pub?gid=558862364&single=true&output=csv")
data <- read.csv(text = csv.text) # parse the downloaded text as CSV
library(ggmap)
## Loading required package: ggplot2
# Keep only the field number and coordinates for mapping the survey sites.
mark.data <- data[, names(data) %in% c("Fno", "Lat", "Long")]
# Fetch a satellite base map centred on the mean of the survey coordinates.
mapgilbert <- get_map(location = c(lon = mean(mark.data$Long), lat = mean(mark.data$Lat)),
zoom = 4, maptype = "satellite", scale = 2)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=7.743192,99.171764&zoom=4&size=640x640&scale=2&maptype=satellite&language=en-EN&sensor=false
# Overlay the survey locations on the base map.
# NOTE(review): fill and alpha are constants placed inside aes(), which maps
# them and would create legends; the guides(... = FALSE) call hides those
# legends again. Moving them outside aes() would be the cleaner fix.
ggmap(mapgilbert) + geom_point(data = mark.data, aes(x = Long, y = Lat,
fill = "red", alpha = 0.8), size = 3, shape = 21) + guides(fill = FALSE,
alpha = FALSE, size = FALSE)
# Alternative approach kept for reference (static world map via rworldmap):
# library(rworldmap) names(data) mark.data <- data[, names(data) %in%
# c('Fno', 'Lat', 'Long')] map <- get_map(location = 'India and
# Southeast Asia', zoom = 4) newmap <- getMap(resolution = 'low')
# plot(newmap, xlim = c(70, 120), ylim = c(-10, 20), asp = 1)
# points(mark.data$Long, mark.data$Lat, col = 'red', cex = .6)
The packages dplyr, plyr, reshape, reshape2, and ggplot2 were used to analyse the crop health data (Wickham and Francois 2015; Wickham 2011; Wickham 2007).
# ==============
library(dplyr) # arrange data structure
library(plyr)
library(reshape)
library(reshape2)
library(lubridate)
# ================
library(ggplot2) # plotting
library(gridExtra)
library(scales)
library(cowplot)
# ===============
library(bioDist) # Co-ocurrance analysis
library(vegan) # Co-ocurrance analysis
library(WGCNA)
## ==========================================================================
## *
## * Package WGCNA 1.47 loaded.
## *
## * Important note: It appears that your system supports multi-threading,
## * but it is not enabled within WGCNA in R.
## * To allow multi-threading within WGCNA with all available cores, use
## *
## * allowWGCNAThreads()
## *
## * within R. Use disableWGCNAThreads() to disable threading if necessary.
## * Alternatively, set the following environment variable on your system:
## *
## * ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## * for example
## *
## * ALLOW_WGCNA_THREADS=4
## *
## * To set the environment variable in linux bash shell, type
## *
## * export ALLOW_WGCNA_THREADS=4
## *
## * before running R. Other operating systems or shells will
## * have a similar command to achieve the same aim.
## *
## ==========================================================================
# ==============
library(igraph) # Network analysis package
library(qgraph)
library(WGCNA)
Survey data were stored on Google Drive under this link.
# Replace sentinel values in the raw survey sheet with proper NA markers.
data[data == "-"] <- NA # a dash denotes a missing observation
data[data == ""] <- NA  # empty cells are likewise missing
# ==== normalise variable names ====
names(data) <- tolower(names(data)) # lower-case names for consistency
# ======================================================================
# Drop the survey-metadata and crop-management variables that are not
# part of the injury-profile analysis.
drop.vars <- c("phase", "identifier", "village", "css", "ccd", "year",
               "season", "lat", "long", "fa", "fn", "fp", "pc", "cem",
               "ast", "vartype", "varcoded", "fym", "fymcoded", "n", "p",
               "k", "mf", "wcp", "iu", "hu", "fu", "cs", "ldg", "yield",
               "dscum", "wecum", "nplsqm", "npmax", "nltmax", "nlhmax",
               "lfm", "ced", "cedjul", "hd", "hdjul", "cvr", "varcode",
               "fymcode", "mu", "nplsm", "rbpx")
data <- data[, !names(data) %in% drop.vars]
#======================================================================
#=================== correct the variable types =======================
#======================================================================
# NOTE(review): the original converted factor columns with as.numeric(),
# which returns the internal level codes rather than the recorded values
# (read.csv under R 3.2.2 reads character columns as factors). Converting
# via as.character() first preserves the actual measurements. It also
# enumerated every column by hand; converting everything except the two
# identifier columns is equivalent and covers columns (e.g. ntmax) the
# hand-written list missed.
id.cols <- c("fno", "country")
num.cols <- setdiff(names(data), id.cols)
data[num.cols] <- lapply(data[num.cols],
                         function(col) as.numeric(as.character(col)))
data$country <- as.factor(data$country)
# Keep the identifier columns first: downstream code indexes them as 1:2.
data <- data[, c(id.cols, num.cols)]
# Drop constant (zero-variance) columns; they carry no co-occurrence signal.
# NOTE(review): the original applied a length-(ncol-2) logical vector to all
# ncol columns, silently recycling over 'fno' and 'country' and misaligning
# the tests; prepending TRUE, TRUE keeps the identifiers and aligns the
# variance tests with their columns.
keep <- c(TRUE, TRUE,
          vapply(data[, num.cols], var, numeric(1), na.rm = TRUE) != 0)
data <- data[, keep]
data <- data[complete.cases(data), ] # drop rows containing any NA
library(xtable)
# Render the first rows of the first ten columns as an HTML preview table.
preview <- head(data[1:10])
print(xtable(preview), type = "html")
| fno | country | ntmax | waa | wba | dhx | whx | ssx | thrx | pmx | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | PHL | 18.00 | 78.00 | 49.00 | 108.00 | 107.00 | 2.00 | 0.00 | 0.00 |
| 2 | 2 | PHL | 17.30 | 32.00 | 4.00 | 120.00 | 7.00 | 2.00 | 0.00 | 0.00 |
| 3 | 3 | PHL | 19.40 | 42.00 | 4.00 | 18.00 | 40.00 | 2.00 | 0.00 | 0.00 |
| 4 | 4 | PHL | 18.90 | 48.00 | 4.00 | 18.00 | 29.00 | 2.00 | 0.00 | 0.00 |
| 5 | 5 | PHL | 19.80 | 71.00 | 49.00 | 58.00 | 33.00 | 2.00 | 0.00 | 0.00 |
| 6 | 6 | PHL | 19.20 | 21.00 | 56.00 | 14.00 | 5.00 | 2.00 | 0.00 | 0.00 |
# Reshape the injury columns to long format so each variable can be
# plotted as its own histogram.
m.data <- melt(data[, !names(data) %in% c("fno", "country")])
varnames <- colnames(data[, !names(data) %in% c("fno", "country")])
p <- list()
for (i in seq_along(varnames)) { # seq_along is safe if varnames is empty
  gdata <- m.data %>% filter(variable == varnames[i])
  p[[i]] <- ggplot(gdata, aes(x = value)) + geom_histogram(stat = "bin") +
    ggtitle(paste("Histogram of", varnames[i], sep = " "))
}
# Arrange every histogram in one grid. do.call generalizes the original
# hard-coded p[[1]] ... p[[31]] call to any number of variables.
do.call(plot_grid, c(p, list(ncol = 3, align = "v")))
The first step in the analysis is to identify outlier samples using hierarchical cluster analysis. After removing the outliers, we construct a weighted network.
The analysis presented in this paper is designed to capture the co-occurrence patterns of pest injuries, weed infestation, and disease within geographical levels that are consistent. We considered both positive and negative co-occurrence, derived from rank correlations (Spearman’s correlation) between each pair of injuries within each dataset, with the strength of the relationship represented by the correlation coefficient. Negative correlations (indicative of adverse occurrence) were also included in this analysis, although they were a small subset of our combined datasets. We only considered negative and positive co-occurrence relationships whose correlation coefficients had \(p\)-values less than 0.05.
Spearman’s correlation was used because it only checks whether two injuries are monotonically related, rather than requiring a linear relationship. As a result it is less sensitive to differences in abundance, which is desirable because abundance information may be unreliable. We employed Spearman correlation coefficients because the distribution of the injury-profile data is unknown and often does not satisfy the normality condition.
We selected only the variables related to injury profiles, namely insect pests, weed infestations, and diseases.
# head(data) select only the variables related to the injury profiles
# The injury-profile variables occupy a contiguous block of columns that
# begins at 'dhx' and ends at 'rtx'.
start.IP <- "dhx" # name of the first injury-profile column
end.IP <- "rtx"   # name of the last injury-profile column
start.col.IP <- which(names(data) == start.IP) # index of the first column
end.col.IP <- which(names(data) == end.IP)     # index of the last column
IP.data <- data[start.col.IP:end.col.IP]       # slice out the injury columns
We clustered the survey fields to check if there are any obvious outliers.
# ==== pre-process before running the correlation analysis: the
# correlation measures cannot be computed for variables that have
# no variance, so drop constant columns first.
IP.data <- IP.data[, apply(IP.data, 2, var, na.rm = TRUE) != 0] # exclude the column (variables) with variation = 0
#
# Average-linkage hierarchical clustering of the survey fields on their
# injury profiles, used to spot outlying fields visually.
# NOTE(review): IP.data[-1] excludes the FIRST injury column from the
# distance computation -- presumably a leftover from when an identifier
# column sat in position 1; confirm this is intentional.
sampleTree <- hclust(dist(IP.data[-1]), method = "average")
plot(sampleTree, main = "Sample clus")
# Plot a line to show the cut height used for outlier removal below
abline(h = 265, col = "red")
We can see that there appears to be one outlier. One can remove it by hand, or use an automatic approach. We can remove the out-group by choosing a height cut at 265 (the red line in the plot).
# Determine clusters under the cut line. cutreeStatic (WGCNA) cuts the
# dendrogram at cutHeight; branches smaller than minSize end up in the
# unassigned group.
clust = cutreeStatic(sampleTree, cutHeight = 265, minSize = 10)
table(clust)
## clust
## 1
## 420
# clust 1 contains the samples we want to keep.
keepSamples <- clust == 1
IP.data <- IP.data[keepSamples, ]
# Re-attach the country label so correlations can be computed per country.
# NOTE(review): data$country is NOT subset by keepSamples, so this cbind only
# aligns because every sample fell into cluster 1 here (table above shows
# 420/420 kept). If any outlier were removed, country would need the same
# keepSamples filtering -- confirm before reusing on other data.
country <- data$country # extract the country column to attach below
IP.data <- cbind(country, IP.data)
# Defensive: replace any remaining NA with 0 (complete.cases earlier in the
# script should already have removed NA rows).
IP.data[is.na(IP.data)] <- 0
name.country <- as.vector(unique(IP.data$country))
# ===== co_occurrence_pairwise.R =====
# Pairwise Spearman co-occurrence of injury variables within each country.
# Rows are accumulated in a list and bound once at the end, instead of
# growing a matrix with rbind() inside the loop (which copies the whole
# matrix on every append).
result.rows <- list()
# Fix: the option that suppresses warnings is 'warn'; the original set
# options(warnings = -1), which is not a recognised option and had no effect.
options(warn = -1)
for (a in seq_along(name.country)) {
  # subset the survey rows belonging to this country
  country.temp <- name.country[a]
  temp <- subset(IP.data, country == country.temp)
  # column 1 is 'country', so the injury variables start at column 2;
  # compare every variable with every later variable exactly once
  for (b in 2:(dim(temp)[2] - 1)) {
    for (c in (b + 1):(dim(temp)[2])) {
      # total abundance of each variable within this country's surveys
      species1.ab <- sum(temp[, b])
      species2.ab <- sum(temp[, c])
      if (species1.ab > 1 & species2.ab > 1) {
        # exact = FALSE avoids the exact p-value computation, which warns
        # in the presence of ties (common with count data); see
        # stackoverflow.com/questions/10711395 on Spearman and ties.
        # Spearman remains valid when the data are not normally distributed.
        # NOTE(review): the original also passed na.action = na.rm, which
        # cor.test() silently ignores via ..., so it has been dropped.
        test <- cor.test(temp[, b], temp[, c], method = "spearman",
                         exact = FALSE)
        rho <- test$estimate
        p.value <- test$p.value
      } else {
        # a variable that is (nearly) all zeros: record a null association
        rho <- 0
        p.value <- 1
      }
      result.rows[[length(result.rows) + 1]] <-
        c(name.country[a], names(temp)[b], names(temp)[c],
          rho, p.value, species1.ab, species2.ab)
    }
  }
}
country.results <- do.call(rbind, result.rows)
# Convert the accumulated character matrix into a typed data frame.
# NOTE(review): data.matrix() is documented for data frames; its behaviour on
# a character matrix is version-dependent, and the character columns
# (country, var1, var2) only survive through the as.factor/as.character
# round-trips below -- confirm this step still works on the current R version.
country.results <- data.frame(data.matrix(country.results))
names(country.results) <- c("country", "var1", "var2", "rho", "p.value",
"ab1", "ab2")
# making sure certain variables are factors
country.results$country <- as.factor(country.results$country)
# var1/var2 are injury-variable names; store them as plain character strings
country.results$var1 <- as.character(as.factor(country.results$var1))
country.results$var2 <- as.character(as.factor(country.results$var2))
# correlation statistics and abundances back to numeric
country.results$rho <- as.numeric(as.character(country.results$rho))
country.results$p.value <- as.numeric(as.character(country.results$p.value))
country.results$ab1 <- as.numeric(as.character(country.results$ab1))
country.results$ab2 <- as.numeric(as.character(country.results$ab2))
# Inspect the first few correlation records
head(country.results)
## country var1 var2 rho p.value ab1 ab2
## 1 PHL dhx whx 0.06524012 0.6891879742 1307 2089
## 2 PHL dhx ssx -0.12272044 0.4506097948 1307 109
## 3 PHL dhx defa 0.18989475 0.2405423584 1307 1941
## 4 PHL dhx bphx -0.11699678 0.4721633481 1307 1286
## 5 PHL dhx wbpx -0.50743562 0.0008316712 1307 84
## 6 PHL dhx awx 0.00000000 1.0000000000 1307 1
# Keep only the statistically significant correlations (p < 0.05).
sub_country.results <- subset(country.results, as.numeric(as.character(p.value)) < 0.05)
# Split the significant pairs by country; element i holds country i's pairs.
# Preallocated list + seq_along instead of growing inside 1:length().
results_sub.by.country <- vector("list", length(name.country))
for (i in seq_along(name.country)) {
  results_sub.by.country[[i]] <- subset(sub_country.results,
                                        country == name.country[i])
}
### Network analysis
# head(results_sub.by.group[[1]][,2:3]) # get the list
#layout(matrix(c(1:14), 9, 2, byrow = TRUE))
#png("Netcountry.png", units="px", width=2400, height=3600, res=300)
# Arrange the country networks on a 3x2 grid of base-graphics panels.
layout(rbind(c(1,2), c(3,4), c(5,6)))
cnet <- list()
# Build one undirected graph per country from its significant variable
# pairs, style the vertices and edges, and draw it on a circular layout.
for (i in 1:length(name.country)) {
# columns 2:3 of the results hold the two variable names of each pair
cnet[[i]] <- graph.edgelist(as.matrix(results_sub.by.country[[i]][, 2:3]), directed = FALSE)
l <- layout.circle(cnet[[i]])
# ---- vertex appearance ----
V(cnet[[i]])$color <- adjustcolor("slateblue1", alpha.f = .6)
V(cnet[[i]])$frame.color <- adjustcolor("slateblue1", alpha.f = .6)
V(cnet[[i]])$shape <- "circle"
V(cnet[[i]])$size <- 20
V(cnet[[i]])$label.color <- "white"
V(cnet[[i]])$label.font <- 2
V(cnet[[i]])$label.family <- "Helvetica"
V(cnet[[i]])$label.cex <- 0.7
# == adjust the edge: width proportional to |rho| (column 4 of the results)
E(cnet[[i]])$weight <- as.matrix(results_sub.by.country[[i]][, 4])
E(cnet[[i]])$width <- abs(E(cnet[[i]])$weight) * 10
# negative correlations drawn in salmon, positive in forest green
col <- c("salmon", "forestgreen")
colc <- cut(results_sub.by.country[[i]]$rho, breaks = c(-1, 0, 1),
include.lowest = TRUE)
E(cnet[[i]])$color <- col[colc]
# == plot network model
plot(cnet[[i]], layout = l * 1, main = paste("network model of", name.country[i]))
}
#dev.off()
# Compute global (per-network) and individual (per-vertex) topology
# statistics for each country network.
globe.network.value <- list()
ind.network.value <- list()
for (i in seq_along(cnet)) {
  # use |rho| as the edge weight for the weighted topology measures
  E(cnet[[i]])$weight <- abs(as.matrix(results_sub.by.country[[i]][, 4]))
  adj.mat <- as.matrix(as_adj(cnet[[i]], attr = "weight"))
  # Hoist the expensive computations out of the data.frame() calls: the
  # original recomputed fundamentalNetworkConcepts() for every column and
  # centrality_auto() twice per iteration.
  fnc <- fundamentalNetworkConcepts(adj.mat)
  node.cent <- centrality_auto(cnet[[i]])$node.centrality
  # whole-network summary statistics
  globe.network.value[[i]] <- data.frame(country = name.country[i],
                                         Node = vcount(cnet[[i]]),
                                         Link = ecount(cnet[[i]]),
                                         diameter = diameter(cnet[[i]]),
                                         avgConnect = mean(fnc$ScaledConnectivity),
                                         avgGeodesic = mean_distance(cnet[[i]]),
                                         density = fnc$Density,
                                         smallworld = as.matrix(smallworldness(cnet[[i]]))[1],
                                         centralization = fnc$Centralization,
                                         heterogeneity = fnc$Heterogeneity
                                         )
  # per-vertex centrality statistics
  ind.network.value[[i]] <- data.frame(country = name.country[i],
                                       node = V(cnet[[i]])$name,
                                       connectivity = fnc$ScaledConnectivity,
                                       betweenness = node.cent["Betweenness"]$Betweenness,
                                       closeness = node.cent["Closeness"]$Closeness,
                                       clusterCoef = fnc$ClusterCoef,
                                       MAR = fnc$MAR,
                                       eigenvector = evcent(cnet[[i]])$vector
                                       )
  # residuals from regressing eigenvector centrality on betweenness are
  # used later to flag key actors
  ind.network.value[[i]]$res = as.vector(lm(eigenvector ~ betweenness, data = ind.network.value[[i]])$residuals)
}
# stack the per-country results into single data frames
globe.network.value <- as.data.frame(do.call("rbind", globe.network.value))
ind.network.value <- as.data.frame(do.call("rbind", ind.network.value))
# LaTeX rendering of the per-country network topology summary
print(xtable(globe.network.value))
## % latex table generated in R 3.2.2 by xtable 1.7-4 package
## % Fri Oct 23 16:31:12 2015
## \begin{table}[ht]
## \centering
## \begin{tabular}{rlrrrrrrrrr}
## \hline
## & country & Node & Link & diameter & avgConnect & avgGeodesic & density & smallworld & centralization & heterogeneity \\
## \hline
## 1 & PHL & 21 & 33.00 & 1.85 & 0.28 & 2.26 & 0.07 & 0.80 & 0.19 & 0.89 \\
## 2 & VNM & 20 & 47.00 & 1.24 & 0.53 & 2.05 & 0.08 & 1.24 & 0.08 & 0.47 \\
## 3 & THA & 18 & 63.00 & 1.11 & 0.47 & 1.81 & 0.15 & 1.07 & 0.20 & 0.70 \\
## 4 & IDN & 22 & 48.00 & 1.40 & 0.46 & 2.25 & 0.06 & 1.24 & 0.07 & 0.64 \\
## 5 & IND & 10 & 17.00 & 1.38 & 0.55 & 2.02 & 0.14 & 1.72 & 0.14 & 0.51 \\
## \hline
## \end{tabular}
## \end{table}
# Drop the auto-generated row names before producing the HTML version
row.names(globe.network.value) <- NULL
row.names(ind.network.value) <- NULL
globe.topo.table <- xtable(globe.network.value)
print(globe.topo.table, type = "html")
| country | Node | Link | diameter | avgConnect | avgGeodesic | density | smallworld | centralization | heterogeneity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | PHL | 21 | 33.00 | 1.85 | 0.28 | 2.26 | 0.07 | 0.80 | 0.19 | 0.89 |
| 2 | VNM | 20 | 47.00 | 1.24 | 0.53 | 2.05 | 0.08 | 1.24 | 0.08 | 0.47 |
| 3 | THA | 18 | 63.00 | 1.11 | 0.47 | 1.81 | 0.15 | 1.07 | 0.20 | 0.70 |
| 4 | IDN | 22 | 48.00 | 1.40 | 0.46 | 2.25 | 0.06 | 1.24 | 0.07 | 0.64 |
| 5 | IND | 10 | 17.00 | 1.38 | 0.55 | 2.02 | 0.14 | 1.72 | 0.14 | 0.51 |
One way to identify key actors in a network is to compare relative values of centrality such as eigenvector centrality and betweenness. It is apparent that many measures of centrality are correlated. If we assume a linear relationship between eigenvector centrality and betweenness and regress eigenvector centrality on betweenness, the residuals can be used to identify key players. A vertex or individual with a higher level of betweenness and lower eigenvector centrality may be a critical gatekeeper, an individual that is central to the functioning of the network. One with a lower level of betweenness and higher eigenvector centrality may have unique access to other individuals that are key to the functioning of the network.
A variable with high betweenness and low eigenvector centrality may be an important gatekeeper to a central variable.
A variable with low betweenness and high eigenvector centrality may have unique access to a central variable.
# Key-actor scatter plot per country: betweenness vs eigenvector centrality,
# labelled by variable name and coloured by the regression residual.
# NOTE(review): the original called layout(matrix(1:9, 5, 2)) here, which
# both warned (9 is not a multiple of 5 x 2) and had no effect -- layout()
# controls base graphics, while these plots are ggplot objects drawn with
# print(); the call has therefore been removed.
for (i in seq_along(name.country)) {
  plot <- ind.network.value %>%
    filter(country == name.country[i]) %>%
    ggplot(aes(x = betweenness, y = eigenvector, label = node,
               color = res, size = abs(res))) +
    xlab("Betweenness Centrality") +
    ylab("Eigenvector Centrality") + # typo fixed ("Eigencvector")
    geom_text() +
    ggtitle(paste("Key Actor Analysis for Injury Profiles of", # typo fixed
                  name.country[i]))
  print(plot)
}
# Per-country dot plots of four per-vertex centrality measures.
ind.network.value[is.na(ind.network.value)] <- 0
for (i in seq_along(name.country)) {
  cdat <- ind.network.value %>% filter(country == name.country[i])
  # One spec per panel: which measure to plot, its point colour, and its
  # x-axis label. This replaces four near-identical copies of the same
  # ggplot block (g1..g4 in the original).
  specs <- list(
    list(var = "connectivity", col = "black", lab = "Connectivity"),
    list(var = "betweenness", col = "blue", lab = "Betweenness"),
    list(var = "closeness", col = "red", lab = "Closeness"),
    list(var = "clusterCoef", col = "purple", lab = "Clustering coefficient")
  )
  # Build each dot plot: nodes on the y axis ordered by the measure's value.
  plots <- lapply(specs, function(s) {
    ggplot(cdat, aes(x = cdat[[s$var]], y = reorder(node, cdat[[s$var]]))) +
      geom_point(size = 3, color = s$col) +
      xlab(s$lab) +
      ylab("Variables") +
      theme_bw() +
      theme(panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            panel.grid.major.y = element_line(color = "black", linetype = 3))
  })
  # Arrange the four panels side by side, labelled A-D as before.
  G <- plot_grid(plotlist = plots, labels = c("A", "B", "C", "D"),
                 nrow = 1, align = "h")
  print(G)
}
#png("Netfactoranalysis.png", units="px", width=1800, height=2700, res=300)
#layout(rbind(c(1,2), c(3,4), c(5,6)))
# Redraw each country network with vertex size proportional to the absolute
# residual from the eigenvector ~ betweenness regression; only vertices with
# |residual| >= 0.25 keep their labels, highlighting the key factors.
# NOTE(review): this assumes the row order of ind.network.value within a
# country matches V(cnet[[i]]) vertex order -- true as constructed above,
# but worth confirming if either side changes.
for (i in seq_along(cnet)) {
  l <- layout.circle(cnet[[i]])
  # hoisted: the original recomputed this subset twice per iteration
  res.i <- abs(subset(ind.network.value, country == name.country[i])$res)
  V(cnet[[i]])$size <- res.i * 20 # scale vertex size by key-actor score
  node <- as.character(V(cnet[[i]])$name)
  node[which(res.i < 0.25)] <- NA # hide labels of minor vertices
  plot(cnet[[i]], vertex.label = node,
       vertex.label.dist = 0.5,
       vertex.label.color = 'blue', edge.width = 1,
       main = paste("Key factor network model of", name.country[i]),
       margin = 0.1)
}
#dev.off()
# png("Netcountry.png", units="px", width=2400, height=1600, res=300)
# layout(rbind(c(1,2,3), c(4,5,6)))
# plot(x, main="100,000 points", col=adjustcolor("black", alpha=0.2))
# dev.off()
Wickham, Hadley. 2011. “The Split-Apply-Combine Strategy for Data Analysis.” Journal of Statistical Software 40 (1): 1–29. http://www.jstatsoft.org/v40/i01/.
Wickham, Hadley, and Romain Francois. 2015. Dplyr: A Grammar of Data Manipulation. http://CRAN.R-project.org/package=dplyr.
Wickham, Hadley. 2007. “Reshaping Data with the Reshape Package.” Journal of Statistical Software 21 (12). http://www.jstatsoft.org/v21/i12/paper.